library(tidyverse)
library(highcharter)
library(factoextra)
library(sf)
library(caret)
library(lubridate)
library(patchwork)
set.seed(543)
#data is just imported here. Script for preprocessing and other wrangling and 
#imports from the Spotify API located in this folder under
#Spotify_get_script.R
billboard <- read_csv("data/spotify_full_100_10s.csv") %>% 
  rename(id = id...12)
billboard %>% 
  distinct(id, .keep_all = TRUE) %>% 
  group_by(album.name) %>% 
  hchart(type = "scatter", hcaes(energy, valence, group = album.name)) %>% 
  hc_legend(verticalAlign = "left", layout = "horizontal", x = 30, y = 15, fontSize = "10px") %>% 
  hc_title(text = "Happiness vs. Energeticness of the Top 100 Albums", align = "center")
#Getting the country codes
countries <- read_csv("data/country_codes.csv")


#Converting to a shape Object
new_data <- billboard %>% 
  left_join(y = countries, by = "country") %>% 
  group_by(album.name, country) %>% 
  summarise_if(is.numeric, mean) %>% 
  ungroup() %>% 
  st_as_sf(coords = c("Latitude", "Longitude"))

#Plotting countries in the world
hcmap(map = "custom/world-robinson-lowres",
      data = new_data,
      name = "Happiness of Music (Valence)",
      value = "valence",
      borderWidth = 0,
      nullColor = "#d3d3d3") %>%
  hc_colorAxis(
    stops = color_stops(colors = viridisLite::magma(10, begin = 0.1)),
    type = "logarithmic"
    ) %>% 
  hc_title(text = "Happiness of the Billboard Top 100 Albums Available Around the World")

What is the relationship between happiness and the other variables?

From the Spotify Dataset, there are a few parameters that are computed as functions of other parameters found in the data. These parameters are energy, loudness, and acousticness. For example, loudness is used in the calculations of energy. For this reason, these terms will be interaction terms in the regression model.

#selecting all of the numeric attributes for a linear model
model_ready <- billboard %>% 
  distinct(id, .keep_all = TRUE) %>% 
  select_if(is.numeric) %>%
  select(-c("tag_freq"))

# Setting up cross validation
ctrl <- trainControl(method = "cv", number = 10)

# trying to model the relationship between happiness and the other variables
lm_spotify <- train(valence ~ danceability + energy * loudness * acousticness + speechiness + 
                     tempo + instrumentalness + liveness + key, data = model_ready,
                    method = "lm",
                    preProc = c("scale", "center"),
                    trControl = ctrl)
summary(lm_spotify$finalModel)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.51329 -0.12136 -0.01593  0.11517  0.62896 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     0.389355   0.005472  71.160  < 2e-16 ***
## danceability                    0.092034   0.006139  14.993  < 2e-16 ***
## energy                          0.147593   0.025639   5.757 1.11e-08 ***
## loudness                       -0.058786   0.038054  -1.545   0.1227    
## acousticness                    0.102364   0.040714   2.514   0.0121 *  
## speechiness                     0.023024   0.005889   3.909 9.81e-05 ***
## tempo                           0.003168   0.005593   0.566   0.5712    
## instrumentalness               -0.011792   0.006279  -1.878   0.0606 .  
## liveness                        0.008281   0.005747   1.441   0.1499    
## key                             0.004152   0.005507   0.754   0.4510    
## `energy:loudness`               0.031487   0.021201   1.485   0.1378    
## `energy:acousticness`          -0.019365   0.024429  -0.793   0.4281    
## `loudness:acousticness`         0.064699   0.055231   1.171   0.2417    
## `energy:loudness:acousticness`  0.004288   0.025755   0.167   0.8678    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1845 on 1123 degrees of freedom
## Multiple R-squared:  0.3435, Adjusted R-squared:  0.3359 
## F-statistic: 45.19 on 13 and 1123 DF,  p-value: < 2.2e-16

Let’s take a look at these plots.

#getting the years of release for the plots
billboard_year <- billboard %>% 
  mutate(year = lubridate::year(release_date)) %>% 
  distinct(id, .keep_all = TRUE)

# plotting individual relationships as a scatter plot
hchart(billboard_year, "scatter", hcaes(x = danceability, y = valence, group = year)) %>% 
  hc_title(text = "Valence and Danceability")
hchart(billboard_year, "scatter", hcaes(x = energy, y = valence, group = year)) %>% 
  hc_title(text = "Valence and Energy")
hchart(billboard_year, "scatter", hcaes(x = speechiness, y = valence, group = year)) %>% 
  hc_title(text = "Valence and Energy")

Lets see how the model fits the data

#predicting the values of valence with the models
predictions <- predict(lm_spotify, billboard)

ggplot(billboard, aes(x = valence, y = predictions)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal() +
  labs(x = "Valence", y = "Predictions", title = "Predictions vs. Observations of Happiness in Music")
## `geom_smooth()` using formula 'y ~ x'